From: Damian Miralles Date: Wed, 13 Dec 2017 20:27:17 +0000 (-0700) Subject: kernel: Adds unaligned protokernels to `32f_x2_s32f_interleave_16ic` and `32f_x2_subt... X-Git-Tag: archive/raspbian/1.3-3+rpi1^2~10 X-Git-Url: https://dgit.raspbian.org/%22http://www.example.com/cgi/success//%22http:/www.example.com/cgi/success/?a=commitdiff_plain;h=72c8794013716523a6d2c1f8c63f68e4c04eafdd;p=volk.git kernel: Adds unaligned protokernels to `32f_x2_s32f_interleave_16ic` and `32f_x2_subtract_32f` Adds unaligned versions of the aforementioned kernels; relative speed improvements are shown in both cases. Gbp-Pq: Name 0015-kernel-Adds-unaligned-protokernles-to-32f_x2_s32f_in.patch --- diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h index 99f1b5e..20f66ff 100644 --- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h +++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h @@ -214,3 +214,66 @@ volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */ + +#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H +#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX2 +#include + +static inline void +volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer, + const float* qBuffer, const float scalar, unsigned int num_points) +{ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m256 vScalar = _mm256_set1_ps(scalar); + + const unsigned int eighthPoints = num_points / 8; + + __m256 iValue, qValue, cplxValue1, cplxValue2; + __m256i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < eighthPoints; number++){ + iValue = _mm256_loadu_ps(iBufferPtr); + qValue = _mm256_loadu_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + 
cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); /* scale the interleaved I/Q floats */ + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm256_cvtps_epi32(cplxValue1); /* float -> int32, rounded per the current MXCSR rounding mode */ + intValue2 = _mm256_cvtps_epi32(cplxValue2); + + intValue1 = _mm256_packs_epi32(intValue1, intValue2); /* narrow to int16 with signed saturation; unpack and pack both work per 128-bit lane, so the output comes out as i0,q0,...,i7,q7 */ + + _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); /* unaligned 256-bit store: 16 int16 values */ + complexVectorPtr += 16; + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; /* first sample not covered by the vector loop */ + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ /* scalar tail: remaining num_points % 8 samples */ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); /* NOTE(review): this cast truncates toward zero and does not saturate, unlike the vector path above -- confirm the mismatch is acceptable */ + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_AVX2 */ + + +#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */ diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h index 4a452fd..b7f36cf 100644 --- a/kernels/volk/volk_32f_x2_subtract_32f.h +++ b/kernels/volk/volk_32f_x2_subtract_32f.h @@ -176,3 +176,48 @@ volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, #endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */ + + +#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H +#define INCLUDED_volk_32f_x2_subtract_32f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include +/* Unaligned AVX kernel: writes aVector[i] - bVector[i] to cVector[i] for every i below num_points. */ +static inline void +volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; /* count of full 8-float vector iterations */ + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); /* loadu/storeu: no 32-byte alignment required of the buffers */ + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_sub_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr 
+= 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ /* scalar tail: remaining num_points % 8 elements */ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */